From 80a9af642f279463e302bf0b30dbbc25472a0b25 Mon Sep 17 00:00:00 2001 From: "mafetter@fleming.research" Date: Thu, 17 Mar 2005 12:25:14 +0000 Subject: [PATCH] bitkeeper revision 1.1236.32.10 (4239772aZ9Ayf3Cwr_6ubXtSI1oZ9Q) Initial commit for trying to get a translated dom0 up and running. Signed-off-by: michael.fetterman@cl.cam.ac.uk --- xen/arch/x86/audit.c | 118 +++++++++-- xen/arch/x86/domain.c | 22 +- xen/arch/x86/mm.c | 2 +- xen/arch/x86/shadow.c | 321 +++++++++++++++++++++++++---- xen/arch/x86/traps.c | 2 +- xen/arch/x86/vmx_io.c | 2 +- xen/arch/x86/x86_32/domain_build.c | 26 ++- xen/arch/x86/x86_64/domain_build.c | 2 +- xen/include/asm-x86/domain.h | 4 +- xen/include/asm-x86/mm.h | 12 +- xen/include/asm-x86/shadow.h | 31 +-- xen/include/xen/perfc_defn.h | 2 + 12 files changed, 445 insertions(+), 99 deletions(-) diff --git a/xen/arch/x86/audit.c b/xen/arch/x86/audit.c index 7fcc8c4377..e7d8b9aaf6 100644 --- a/xen/arch/x86/audit.c +++ b/xen/arch/x86/audit.c @@ -36,6 +36,7 @@ static int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0; static int l1, l2, oos_count, page_count; #define FILE_AND_LINE 0 +//#define MFN_TO_WATCH 0x4700 #if FILE_AND_LINE #define adjust(_p, _a) _adjust((_p), (_a), __FILE__, __LINE__) @@ -51,9 +52,17 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) { int errors = 0; int shadow_enabled = shadow_mode_enabled(d) ? 1 : 0; + int l2limit; void _adjust(struct pfn_info *page, int adjtype ADJUST_EXTRA_ARGS) { +#ifdef MFN_TO_WATCH + if (page_to_pfn(page) == MFN_TO_WATCH) + { + APRINTK("adjust(mfn=%p, dir=%d, adjtype=%d) MFN_TO_WATCH", + page_to_pfn(page), dir, adjtype); + } +#endif if ( adjtype ) { // adjust the type count @@ -97,7 +106,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) if ( count < 0 ) { - APRINTK("Audit %d: general count went below zero pfn=%x t=%x ot=%x", + APRINTK("Audit %d: general count went below zero mfn=%x t=%x ot=%x", d->id, page-frame_table, page->u.inuse.type_info, page->tlbflush_timestamp); @@ -105,7 +114,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) } else if ( (count & ~PGT_count_mask) != 0 ) { - APRINTK("Audit %d: general count overflowed pfn=%x t=%x ot=%x", + APRINTK("Audit %d: general count overflowed mfn=%x t=%x ot=%x", d->id, page-frame_table, page->u.inuse.type_info, page->tlbflush_timestamp); @@ -115,17 +124,12 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) page->count_info += dir; } - void adjust_l2_page(unsigned long mfn, int adjtype) + void adjust_l2_page(unsigned long mfn) { unsigned long *pt = map_domain_mem(mfn << PAGE_SHIFT); - int i, limit; - - if ( shadow_mode_external(d) ) - limit = L2_PAGETABLE_ENTRIES; - else - limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; + int i; - for ( i = 0; i < limit; i++ ) + for ( i = 0; i < l2limit; i++ ) { if ( pt[i] & _PAGE_PRESENT ) { @@ -180,7 +184,61 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) } } - adjust(l1page, adjtype); + adjust(l1page, !shadow_enabled); + } + } + + if ( shadow_mode_translate(d) && !shadow_mode_external(d) ) + { + unsigned long hl2mfn = + pt[l2_table_offset(LINEAR_PT_VIRT_START)] >> PAGE_SHIFT; + struct pfn_info *hl2page = pfn_to_page(hl2mfn); + adjust(hl2page, 0); + } + + unmap_domain_mem(pt); + } + + void adjust_hl2_page(unsigned long hl2mfn) + { + unsigned long *pt = map_domain_mem(hl2mfn << PAGE_SHIFT); + int i; + + for ( i = 0; i < l2limit; i++ ) + { + if ( pt[i] & _PAGE_PRESENT ) + { + unsigned long gmfn = pt[i] >> PAGE_SHIFT; + struct pfn_info *gpage = 
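/* Unlike the l1 walker elsewhere in this file, adjust_hl2_page() sees
 * entries that map guest-physical frames straight to machine frames,
 * so each present slot contributes only a general (untyped) reference
 * to its target frame -- adjust(gpage, 0) below -- never a type
 * reference. */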
pfn_to_page(gmfn); + + if ( gmfn < 0x100 ) + { + lowmem_mappings++; + continue; + } + + if ( gmfn > max_page ) + { + io_mappings++; + continue; + } + + if ( noisy ) + { + if ( page_get_owner(gpage) != d ) + { + printk("Audit %d: [hl2mfn=%p,i=%x] Skip foreign page " + "dom=%p (id=%d) mfn=%p c=%08x t=%08x\n", + d->id, hl2mfn, i, + page_get_owner(gpage), + page_get_owner(gpage)->id, + gmfn, + gpage->count_info, + gpage->u.inuse.type_info); + continue; + } + } + adjust(gpage, 0); } } @@ -281,13 +339,17 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) case PGT_snapshot: break; case PGT_l1_shadow: - case PGT_hl2_shadow: adjust_l1_page(smfn); if ( page->u.inuse.type_info & PGT_pinned ) adjust(page, 0); break; + case PGT_hl2_shadow: + adjust_hl2_page(smfn); + if ( page->u.inuse.type_info & PGT_pinned ) + adjust(page, 0); + break; case PGT_l2_shadow: - adjust_l2_page(smfn, 0); + adjust_l2_page(smfn); if ( page->u.inuse.type_info & PGT_pinned ) adjust(page, 0); break; @@ -317,6 +379,9 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) if ( !(oos->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ) adjust(pfn_to_page(oos->writable_pl1e >> PAGE_SHIFT), 0); + if ( oos->snapshot_mfn != SHADOW_SNAPSHOT_ELSEWHERE ) + adjust(pfn_to_page(oos->snapshot_mfn), 0); + oos = oos->next; oos_count++; } @@ -400,7 +465,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) adjust(page, 1); if ( page->u.inuse.type_info & PGT_validated ) - adjust_l2_page(mfn, 1); + adjust_l2_page(mfn); break; @@ -468,6 +533,11 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) } } + if ( shadow_mode_external(d) ) + l2limit = L2_PAGETABLE_ENTRIES; + else + l2limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; + adjust_for_pgtbase(); adjust_guest_pages(); @@ -484,7 +554,7 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy) #ifndef NDEBUG -void _audit_domain(struct domain *d, int flags, const char *file, int line) +void _audit_domain(struct domain *d, int flags) { void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn, unsigned long mfn) @@ -569,6 +639,14 @@ void _audit_domain(struct domain *d, int flags, const char *file, int line) struct pfn_info *page; int errors = 0; + if ( (d != current->domain) && shadow_mode_translate(d) ) + { + printk("skipping audit domain of translated domain %d " + "from other context\n", + d->id); + return; + } + if ( d != current->domain ) domain_pause(d); synchronise_pagetables(~0UL); @@ -740,11 +818,10 @@ void _audit_domain(struct domain *d, int flags, const char *file, int line) page_type = a->gpfn_and_flags & PGT_type_mask; switch ( page_type ) { - case PGT_snapshot: - // XXX -- what should we check here? - break; case PGT_l1_shadow: case PGT_l2_shadow: + case PGT_hl2_shadow: + case PGT_snapshot: if ( ((page->u.inuse.type_info & PGT_type_mask) != page_type ) || (page->count_info != 0) ) { @@ -756,7 +833,6 @@ void _audit_domain(struct domain *d, int flags, const char *file, int line) } break; - case PGT_hl2_shadow: // haven't thought about this case yet. default: BUG(); break; @@ -781,9 +857,9 @@ void _audit_domain(struct domain *d, int flags, const char *file, int line) spin_unlock(&d->page_alloc_lock); if ( !(flags & AUDIT_QUIET) ) - printk("Audit dom%d (%s:%d) Done. " + printk("Audit dom%d Done. 
" "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n", - d->id, file, line, page_count, oos_count, l1, l2, ctot, ttot ); + d->id, page_count, oos_count, l1, l2, ctot, ttot); if ( !(flags & AUDIT_ALREADY_LOCKED) ) shadow_unlock(d); diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 029d5fd5a4..a2c3ba4d73 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -344,8 +344,6 @@ static int vmx_final_setup_guest(struct exec_domain *ed, shadow_mode_enable(ed->domain, SHM_enable|SHM_translate|SHM_external); } - update_pagetables(ed); - return 0; out: @@ -416,7 +414,7 @@ int arch_final_setup_guest( ed->arch.failsafe_address = c->failsafe_callback_eip; phys_basetab = c->pt_base; - ed->arch.guest_table = ed->arch.phys_table = mk_pagetable(phys_basetab); + ed->arch.guest_table = mk_pagetable(phys_basetab); if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d, PGT_base_page_table) ) @@ -435,8 +433,22 @@ int arch_final_setup_guest( } #ifdef CONFIG_VMX - if (c->flags & ECF_VMX_GUEST) - return vmx_final_setup_guest(ed, c); + if ( c->flags & ECF_VMX_GUEST ) + { + int error; + + // VMX uses the initially provided page tables as the P2M map. + // + // XXX: This creates a security issue -- Xen can't necessarily + // trust the VMX domain builder. Xen should validate this + // page table, and/or build the table itself, or ??? + // + if ( !pagetable_val(d->arch.phys_table) ) + d->arch.phys_table = ed->arch.guest_table; + + if ( (error = vmx_final_setup_guest(ed, c)) ) + return error; + } #endif update_pagetables(ed); diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 19b01667d4..dc42610662 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -268,7 +268,7 @@ int map_ldt_shadow_page(unsigned int off) if ( unlikely(shadow_mode_enabled(d)) ) { shadow_lock(d); - shadow_remove_all_write_access(d, PGT_l1_shadow, PGT_l1_shadow, gpfn); + shadow_remove_all_write_access(d, PGT_l1_shadow, PGT_l1_shadow, gpfn, gmfn); } res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page); diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c index 67aea9110e..e1868e4c7e 100644 --- a/xen/arch/x86/shadow.c +++ b/xen/arch/x86/shadow.c @@ -71,10 +71,10 @@ shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn, max_type = PGT_l1_shadow; } FSH_LOG("shadow_promote gpfn=%p gmfn=%p nt=%p min=%p max=%p", - gmfn, gmfn, new_type, min_type, max_type); + gpfn, gmfn, new_type, min_type, max_type); if ( min_type <= max_type ) - shadow_remove_all_write_access(d, min_type, max_type, gpfn); + shadow_remove_all_write_access(d, min_type, max_type, gpfn, gmfn); // To convert this page to use as a page table, the writable count // should now be zero. 
Test this by grabbing the page as an page table, @@ -257,7 +257,7 @@ alloc_shadow_page(struct domain *d, break; } - set_shadow_status(d, gpfn, smfn, psh_type); + set_shadow_status(d, gpfn, gmfn, smfn, psh_type); if ( pin ) shadow_pin(smfn); @@ -567,7 +567,7 @@ static void alloc_monitor_pagetable(struct exec_domain *ed) // map the phys_to_machine map into the Read-Only MPT space for this domain mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = - mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR); + mk_l2_pgentry(pagetable_val(d->arch.phys_table) | __PAGE_HYPERVISOR); ed->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT); ed->arch.monitor_vtable = mpl2e; @@ -607,9 +607,79 @@ void free_monitor_pagetable(struct exec_domain *ed) ed->arch.monitor_vtable = 0; } +static int +alloc_p2m_table(struct domain *d) +{ + struct list_head *list_ent; + struct pfn_info *page, *l2page, *l1page; + l2_pgentry_t *l2, l2e, last_l2e = mk_l2_pgentry(0); + l1_pgentry_t *l1 = NULL; + unsigned long va, mfn, pfn; + + l2page = alloc_domheap_page(NULL); + if ( !l2page ) + return 0; + d->arch.phys_table = mk_pagetable(page_to_pfn(l2page) << PAGE_SHIFT); + l2 = map_domain_mem(page_to_pfn(l2page) << PAGE_SHIFT); + memset(l2, 0, PAGE_SIZE); + + list_ent = d->page_list.next; + while ( list_ent != &d->page_list ) + { + page = list_entry(list_ent, struct pfn_info, list); + mfn = page_to_pfn(page); + pfn = machine_to_phys_mapping[mfn]; + ASSERT(pfn != INVALID_M2P_ENTRY); + ASSERT(pfn < (1u<<20)); + + va = pfn << PAGE_SHIFT; + if ( !l2_pgentry_val(l2e = l2[l2_table_offset(va)]) ) + { + l1page = alloc_domheap_page(NULL); + if ( !l1page ) + return 0; + l2e = l2[l2_table_offset(va)] = + mk_l2_pgentry((page_to_pfn(l1page) << PAGE_SHIFT) | + __PAGE_HYPERVISOR); + } + + if ( l2_pgentry_val(last_l2e) != l2_pgentry_val(l2e) ) + { + if ( l1 ) + unmap_domain_mem(l1); + l1 = map_domain_mem(l2_pgentry_val(l2e) & PAGE_MASK); + last_l2e = l2e; + } + + l1[l1_table_offset(va)] = mk_l1_pgentry((mfn << PAGE_SHIFT) | + __PAGE_HYPERVISOR); + list_ent = page->list.next; + } + + if ( l1 ) + unmap_domain_mem(l1); + unmap_domain_mem(l2); + + return 1; +} + +static void +free_p2m_table(struct domain *d) +{ + // uh, this needs some work... :) + BUG(); +} + int __shadow_mode_enable(struct domain *d, unsigned int mode) { struct exec_domain *ed; + int new_modes = (mode & ~d->arch.shadow_mode); + + // Gotta be adding something to call this function. + ASSERT(new_modes); + + // can't take anything away by calling this function. 
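+    //
+    // I.e. mode transitions are monotonic: callers pass the complete
+    // new mode word, which must be a strict superset of the current
+    // one.  For instance (usage sketch), a domain already in
+    // SHM_enable is moved to translated mode with
+    //
+    //     shadow_mode_enable(d, SHM_enable | SHM_translate);
+    //
+    // whereas a mode word that dropped SHM_enable would trip the
+    // ASSERT below.
+    //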
+ ASSERT(!(d->arch.shadow_mode & ~mode)); for_each_exec_domain(d, ed) { @@ -670,8 +740,9 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) } } - if ( !d->arch.shadow_ht ) + if ( new_modes & SHM_enable ) { + ASSERT( !d->arch.shadow_ht ); d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets); if ( d->arch.shadow_ht == NULL ) goto nomem; @@ -680,8 +751,9 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) shadow_ht_buckets * sizeof(struct shadow_status)); } - if ( shadow_mode_log_dirty(d) && !d->arch.shadow_dirty_bitmap ) + if ( new_modes & SHM_log_dirty ) { + ASSERT( !d->arch.shadow_dirty_bitmap ); d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63; d->arch.shadow_dirty_bitmap = xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size / @@ -695,8 +767,28 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) d->arch.shadow_dirty_bitmap_size/8); } + if ( new_modes & SHM_translate ) + { + if ( !(new_modes & SHM_external) ) + { + ASSERT( !pagetable_val(d->arch.phys_table) ); + if ( !alloc_p2m_table(d) ) + { + printk("alloc_p2m_table failed (out-of-memory?)\n"); + goto nomem; + } + } + else + { + // external guests provide their own memory for their P2M maps. + // + unsigned long mfn = pagetable_val(d->arch.phys_table)>>PAGE_SHIFT; + ASSERT( d == page_get_owner(&frame_table[mfn]) ); + } + } + printk("audit1\n"); - _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK, __FILE__, __LINE__); + _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK); printk("audit1 done\n"); // Get rid of any shadow pages from any previous shadow mode. @@ -704,11 +796,12 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) free_shadow_pages(d); printk("audit2\n"); - _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK, __FILE__, __LINE__); + _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK); printk("audit2 done\n"); // Turn off writable page tables. // It doesn't mix with shadow mode. + // And shadow mode offers a superset of functionality. 
// vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables); @@ -749,15 +842,27 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) audit_adjust_pgtables(d, 1, 1); printk("audit3\n"); - _audit_domain(d, AUDIT_ALREADY_LOCKED, __FILE__, __LINE__); + _audit_domain(d, AUDIT_ALREADY_LOCKED); printk("audit3 done\n"); return 0; nomem: - if ( d->arch.shadow_ht != NULL ) + if ( (new_modes & SHM_enable) && (d->arch.shadow_ht != NULL) ) + { xfree(d->arch.shadow_ht); - d->arch.shadow_ht = NULL; + d->arch.shadow_ht = NULL; + } + if ( (new_modes & SHM_log_dirty) && (d->arch.shadow_dirty_bitmap != NULL) ) + { + xfree(d->arch.shadow_dirty_bitmap); + d->arch.shadow_dirty_bitmap = NULL; + } + if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) && + pagetable_val(d->arch.phys_table) ) + { + free_p2m_table(d); + } return -ENOMEM; } @@ -770,6 +875,57 @@ int shadow_mode_enable(struct domain *d, unsigned int mode) return rc; } +static void +translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn) +{ + int i; + l1_pgentry_t *l1; + + l1 = map_domain_mem(l1mfn << PAGE_SHIFT); + for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) + { + if ( is_guest_l1_slot(i) && + (l1_pgentry_val(l1[i]) & _PAGE_PRESENT) ) + { + unsigned long mfn = l1_pgentry_val(l1[i]) >> PAGE_SHIFT; + unsigned long gpfn = __mfn_to_gpfn(d, mfn); + ASSERT((l1_pgentry_val(p2m[gpfn]) >> PAGE_SHIFT) == mfn); + l1[i] = mk_l1_pgentry((gpfn << PAGE_SHIFT) | + (l1_pgentry_val(l1[i]) & ~PAGE_MASK)); + } + } + unmap_domain_mem(l1); +} + +// This is not general enough to handle arbitrary pagetables +// with shared L1 pages, etc., but it is sufficient for bringing +// up dom0. +// +void +translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn) +{ + int i; + l2_pgentry_t *l2; + + ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d)); + + l2 = map_domain_mem(l2mfn << PAGE_SHIFT); + for (i = 0; i < L2_PAGETABLE_ENTRIES; i++) + { + if ( is_guest_l2_slot(i) && + (l2_pgentry_val(l2[i]) & _PAGE_PRESENT) ) + { + unsigned long mfn = l2_pgentry_val(l2[i]) >> PAGE_SHIFT; + unsigned long gpfn = __mfn_to_gpfn(d, mfn); + ASSERT((l1_pgentry_val(p2m[gpfn]) >> PAGE_SHIFT) == mfn); + l2[i] = mk_l2_pgentry((gpfn << PAGE_SHIFT) | + (l2_pgentry_val(l2[i]) & ~PAGE_MASK)); + translate_l1pgtable(d, p2m, mfn); + } + } + unmap_domain_mem(l2); +} + static void free_shadow_ht_entries(struct domain *d) { struct shadow_status *x, *n; @@ -1018,6 +1174,42 @@ void vmx_shadow_clear_state(struct domain *d) shadow_unlock(d); } +static unsigned long +gpfn_to_mfn_safe(struct domain *d, unsigned long gpfn) +{ + ASSERT( shadow_mode_translate(d) ); + + perfc_incrc(gpfn_to_mfn_safe); + + unsigned long va = gpfn << PAGE_SHIFT; + unsigned long phystab = pagetable_val(d->arch.phys_table); + l2_pgentry_t *l2 = map_domain_mem(phystab); + l2_pgentry_t l2e = l2[l2_table_offset(va)]; + unmap_domain_mem(l2); + if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) + { + printk("gpfn_to_mfn_safe(d->id=%d, gpfn=%p) => 0 l2e=%p\n", + d->id, gpfn, l2_pgentry_val(l2e)); + return 0; + } + unsigned long l1tab = l2_pgentry_val(l2e) & PAGE_MASK; + l1_pgentry_t *l1 = map_domain_mem(l1tab); + l1_pgentry_t l1e = l1[l1_table_offset(va)]; + unmap_domain_mem(l1); + + printk("gpfn_to_mfn_safe(d->id=%d, gpfn=%p) => %p phystab=%p l2e=%p l1tab=%p, l1e=%p\n", + d->id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, phystab, l2e, l1tab, l1e); + + if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) + { + printk("gpfn_to_mfn_safe(d->id=%d, gpfn=%p) => 0 l1e=%p\n", + d->id, gpfn, 
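/* (a 0 return doubles as "no mapping" -- see the matching non-present
   l2e case above; callers such as shadow_hl2_table() simply test the
   result for truth) */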
l1_pgentry_val(l1e)); + return 0; + } + + return l1_pgentry_val(l1e) >> PAGE_SHIFT; +} + static unsigned long shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned long smfn) @@ -1037,9 +1229,7 @@ shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn, perfc_incrc(shadow_hl2_table_count); - ASSERT( pagetable_val(current->arch.guest_table) == (gmfn << PAGE_SHIFT) ); - gl2 = current->arch.guest_vtable; - + gl2 = map_domain_mem(gmfn << PAGE_SHIFT); hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT); if ( shadow_mode_external(d) ) @@ -1047,19 +1237,48 @@ shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn, else limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; - for ( i = 0; i < limit; i++ ) + if ( unlikely(current->domain != d) && !shadow_mode_external(d) ) { - unsigned long gl2e = l2_pgentry_val(gl2[i]); - unsigned long mfn; + // Can't use __gpfn_to_mfn() if we don't have one of this domain's + // page tables currently installed. What a pain in the neck! + // + // This isn't common -- it only happens during shadow mode setup + // and mode changes. + // + perfc_incrc(shadow_hl2_other_domain); + for ( i = 0; i < limit; i++ ) + { + unsigned long gl2e = l2_pgentry_val(gl2[i]); + unsigned long mfn; - if ( gl2e & _PAGE_PRESENT ) + if ( (gl2e & _PAGE_PRESENT) && + (mfn = gpfn_to_mfn_safe(d, gl2e >> PAGE_SHIFT)) ) + { + hl2[i] = mk_l1_pgentry((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + get_page(pfn_to_page(mfn), d); + } + else + { + hl2[i] = mk_l1_pgentry(0); + } + } + } + else + { + for ( i = 0; i < limit; i++ ) { - mfn = __gpfn_to_mfn(d, gl2e >> PAGE_SHIFT); - hl2[i] = mk_l1_pgentry((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - get_page(pfn_to_page(mfn), d); + unsigned long gl2e = l2_pgentry_val(gl2[i]); + unsigned long mfn; + + if ( (gl2e & _PAGE_PRESENT) && + (mfn = __gpfn_to_mfn(d, gl2e >> PAGE_SHIFT)) ) + { + hl2[i] = mk_l1_pgentry((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + get_page(pfn_to_page(mfn), d); + } + else + hl2[i] = mk_l1_pgentry(0); } - else - hl2[i] = mk_l1_pgentry(0); } if ( !shadow_mode_external(d) ) @@ -1078,6 +1297,7 @@ shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn, } unmap_domain_mem(hl2); + unmap_domain_mem(gl2); return hl2mfn; } @@ -1122,10 +1342,23 @@ static unsigned long shadow_l2_table( &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + + spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = + mk_l2_pgentry(__pa(page_get_owner( + &frame_table[gmfn])->arch.mm_perdomain_pt) | + __PAGE_HYPERVISOR); + if ( shadow_mode_translate(d) ) // NB: not external { unsigned long hl2mfn; - if ( unlikely(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow)) ) + + spl2e[l2_table_offset(RO_MPT_VIRT_START)] = + mk_l2_pgentry(pagetable_val(d->arch.phys_table) | + __PAGE_HYPERVISOR); + + if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) ) hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); // shadow_mode_translate (but not external) sl2 tables hold a @@ -1140,14 +1373,6 @@ static unsigned long shadow_l2_table( else spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = mk_l2_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - - spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = - mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - - spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = - mk_l2_pgentry(__pa(page_get_owner( - 
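/* the owner of the guest top-level frame is the domain whose
   perdomain mappings this shadow must carry */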
&frame_table[gmfn])->arch.mm_perdomain_pt) | - __PAGE_HYPERVISOR); } else { @@ -1304,7 +1529,7 @@ shadow_alloc_oos_entry(struct domain *d) return f; } -static unsigned long +static inline unsigned long shadow_make_snapshot( struct domain *d, unsigned long gpfn, unsigned long gmfn) { @@ -1554,11 +1779,11 @@ static u32 remove_all_write_access_in_ptpage( } u32 shadow_remove_all_write_access( - struct domain *d, unsigned min_type, unsigned max_type, unsigned long gpfn) + struct domain *d, unsigned min_type, unsigned max_type, + unsigned long gpfn, unsigned long gmfn) { int i; struct shadow_status *a; - unsigned long gmfn = __gpfn_to_mfn(d, gpfn); unsigned long sl1mfn = __shadow_status(d, gpfn, PGT_l1_shadow); u32 count = 0; @@ -2004,6 +2229,8 @@ void __update_pagetables(struct exec_domain *ed) ASSERT( shadow_mode_translate(d) ); + BUG(); // ref counts for hl2mfn and smfn need to be maintained! + mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); @@ -2038,10 +2265,10 @@ int shadow_status_noswap; #define FAIL(_f, _a...) \ do { \ - printk("XXX %s-FAIL (%d,%d)" _f "\n" \ + printk("XXX %s-FAIL (%d,%d,%d)" _f "\n" \ "g=%08lx s=%08lx &g=%08lx &s=%08lx" \ " v2m(&g)=%08lx v2m(&s)=%08lx ea=%08lx\n", \ - sh_check_name, level, l1_idx, ## _a , \ + sh_check_name, level, l2_idx, l1_idx, ## _a , \ gpte, spte, pgpte, pspte, \ v2m(pgpte), v2m(pspte), \ (l2_idx << L2_PAGETABLE_SHIFT) | \ @@ -2076,7 +2303,8 @@ static int check_pte( if ( (spte & mask) != (gpte & mask) ) FAIL("Corrupt?"); - if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) && !oos_ptes ) + if ( (level == 1) && + (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) && !oos_ptes ) FAIL("Dirty coherence"); if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) && !oos_ptes ) @@ -2090,23 +2318,28 @@ static int check_pte( if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) && !oos_ptes ) { - printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d oos_ptes=%d\n", + printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d " + "oos_ptes=%d\n", gpfn, gmfn, smfn, frame_table[gmfn].u.inuse.type_info, page_table_page, oos_ptes); FAIL("RW coherence"); } - if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) && !oos_ptes ) + if ( (level == 1) && + (spte & _PAGE_RW ) && + !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) && + !oos_ptes ) { - printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d oos_ptes=%d\n", + printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d " + "oos_ptes=%d\n", gpfn, gmfn, smfn, frame_table[gmfn].u.inuse.type_info, page_table_page, oos_ptes); FAIL("RW2 coherence"); } - if ( gpfn == smfn ) + if ( gmfn == smfn ) { if ( level > 1 ) FAIL("Linear map ???"); /* XXX this will fail on BSD */ @@ -2280,8 +2513,8 @@ int _check_pagetable(struct exec_domain *ed, char *s) sh_l2_present = sh_l1_present = 0; perfc_incrc(check_pagetable); - ptbase_pfn = gptbase >> PAGE_SHIFT; - ptbase_mfn = __gpfn_to_mfn(d, ptbase_pfn); + ptbase_mfn = gptbase >> PAGE_SHIFT; + ptbase_pfn = __mfn_to_gpfn(d, ptbase_mfn); if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) ) { diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 8516e0b59b..ea5dc63673 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -298,7 +298,7 @@ asmlinkage int do_page_fault(struct xen_regs *regs) } if ( unlikely(shadow_mode_enabled(d)) && - ((addr < PAGE_OFFSET) || shadow_mode_external(d)) && + ((addr < HYPERVISOR_VIRT_START) || shadow_mode_external(d)) && shadow_fault(addr, regs) ) { 
return EXCRET_fault_fixed; diff --git a/xen/arch/x86/vmx_io.c b/xen/arch/x86/vmx_io.c index 763c9d5d28..fcecddcb92 100644 --- a/xen/arch/x86/vmx_io.c +++ b/xen/arch/x86/vmx_io.c @@ -391,7 +391,7 @@ void vmx_do_resume(struct exec_domain *d) __vmwrite(GUEST_CR3, pagetable_val(d->arch.shadow_table)); else // paging is not enabled in the guest - __vmwrite(GUEST_CR3, pagetable_val(d->arch.phys_table)); + __vmwrite(GUEST_CR3, pagetable_val(d->domain->arch.phys_table)); __vmwrite(HOST_CR3, pagetable_val(d->arch.monitor_table)); __vmwrite(HOST_ESP, (unsigned long)get_stack_bottom()); diff --git a/xen/arch/x86/x86_32/domain_build.c b/xen/arch/x86/x86_32/domain_build.c index a0b2b94808..b039a87aa6 100644 --- a/xen/arch/x86/x86_32/domain_build.c +++ b/xen/arch/x86/x86_32/domain_build.c @@ -50,6 +50,7 @@ int construct_dom0(struct domain *d, char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */ int shadow_dom0 = 1; // HACK ALERT !! Force dom0 to run in shadow mode. + int translate_dom0 = 1; // HACK ALERT !! Force dom0 to run in shadow translate mode /* * This fully describes the memory layout of the initial domain. All @@ -73,6 +74,7 @@ int construct_dom0(struct domain *d, unsigned long mpt_alloc; extern void physdev_init_dom0(struct domain *); + extern void translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn); /* Sanity! */ if ( d->id != 0 ) @@ -314,7 +316,7 @@ int construct_dom0(struct domain *d, d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; d->shared_info->n_vcpu = smp_num_cpus; - /* setup shadow and monitor tables */ + /* setup monitor table */ update_pagetables(ed); /* Install the new page tables. */ @@ -388,9 +390,27 @@ int construct_dom0(struct domain *d, new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start); - if ( shadow_dom0 ) + if ( shadow_dom0 || translate_dom0 ) { - shadow_mode_enable(d, SHM_enable); + shadow_mode_enable(d, (translate_dom0 + ? SHM_enable | SHM_translate + : SHM_enable)); + if ( translate_dom0 ) + { + // map this domain's p2m table into current page table, + // so that we can easily access it. + // + ASSERT( root_pgentry_val(idle_pg_table[1]) == 0 ); + ASSERT( pagetable_val(d->arch.phys_table) ); + idle_pg_table[1] = mk_root_pgentry( + pagetable_val(d->arch.phys_table) | __PAGE_HYPERVISOR); + translate_l2pgtable(d, (l1_pgentry_t *)(1u << L2_PAGETABLE_SHIFT), + pagetable_val(ed->arch.guest_table) + >> PAGE_SHIFT); + idle_pg_table[1] = mk_root_pgentry(0); + local_flush_tlb(); + } + update_pagetables(ed); /* XXX SMP */ } diff --git a/xen/arch/x86/x86_64/domain_build.c b/xen/arch/x86/x86_64/domain_build.c index be7fc6c8c8..738729f83e 100644 --- a/xen/arch/x86/x86_64/domain_build.c +++ b/xen/arch/x86/x86_64/domain_build.c @@ -328,7 +328,7 @@ int construct_dom0(struct domain *d, d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; d->shared_info->n_vcpu = smp_num_cpus; - /* Set up shadow and monitor tables. */ + /* Set up monitor table. */ update_pagetables(ed); /* Install the new page tables. 
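   (x86_32 note: with translate_dom0 set, construct_dom0() above briefly
   installs dom0's p2m root in idle_pg_table[1], making the p2m readable
   as a flat l1_pgentry_t array at virtual address
   1u << L2_PAGETABLE_SHIFT -- the argument passed to
   translate_l2pgtable() -- before clearing the slot and flushing the
   TLB.)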
*/ diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index 02276cbb55..ea11b9ff71 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -50,6 +50,8 @@ struct arch_domain struct out_of_sync_entry *out_of_sync_extras; unsigned int out_of_sync_extras_count; + pagetable_t phys_table; /* guest 1:1 pagetable */ + } __cacheline_aligned; struct arch_exec_domain @@ -115,8 +117,6 @@ struct arch_exec_domain pagetable_t shadow_table; /* (MA) shadow of guest */ pagetable_t monitor_table; /* (MA) used in hypervisor */ - pagetable_t phys_table; /* guest 1:1 pagetable */ - l2_pgentry_t *guest_vtable; /* virtual address of pagetable */ l2_pgentry_t *shadow_vtable; /* virtual address of shadow_table */ l2_pgentry_t *monitor_vtable; /* virtual address of monitor_table */ diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 4ae9f09f08..49f8932a76 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -155,7 +155,7 @@ void free_page_type(struct pfn_info *page, unsigned int type); extern void invalidate_shadow_ldt(struct exec_domain *d); extern u32 shadow_remove_all_write_access( struct domain *d, unsigned min_type, unsigned max_type, - unsigned long gpfn); + unsigned long gpfn, unsigned long gmfn); extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn); static inline void put_page(struct pfn_info *page) @@ -361,15 +361,15 @@ int audit_adjust_pgtables(struct domain *d, int dir, int noisy); #define AUDIT_ERRORS_OK ( 1u << 1 ) #define AUDIT_QUIET ( 1u << 2 ) -void _audit_domain(struct domain *d, int flags, const char *file, int line); -#define audit_domain(_d) _audit_domain((_d), 0, __FILE__, __LINE__) +void _audit_domain(struct domain *d, int flags); +#define audit_domain(_d) _audit_domain((_d), 0) void audit_domains(void); #else -#define _audit_domain(_d, _f, _file, _line) ((void)0) -#define audit_domain(_d) ((void)0) -#define audit_domains() ((void)0) +#define _audit_domain(_d, _f) ((void)0) +#define audit_domain(_d) ((void)0) +#define audit_domains() ((void)0) #endif diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h index a17c601611..ffbe9042e5 100644 --- a/xen/include/asm-x86/shadow.h +++ b/xen/include/asm-x86/shadow.h @@ -169,7 +169,8 @@ static inline void shadow_mode_disable(struct domain *d) #define __gpfn_to_mfn(_d, gpfn) \ ( (shadow_mode_translate(_d)) \ - ? phys_to_machine_mapping(gpfn) \ + ? ({ ASSERT(current->domain == (_d)); \ + phys_to_machine_mapping(gpfn); }) \ : (gpfn) ) /************************************************************************/ @@ -541,10 +542,6 @@ static inline void l1pte_propagate_from_guest( unsigned long mfn = __gpfn_to_mfn(d, pfn); unsigned long spte; -#if SHADOW_VERBOSE_DEBUG - unsigned long old_spte = *spte_p; -#endif - spte = 0; if ( mfn && @@ -560,12 +557,12 @@ static inline void l1pte_propagate_from_guest( spte &= ~_PAGE_RW; } } +#if 0 -#if SHADOW_VERBOSE_DEBUG - if ( old_spte || spte || gpte ) - SH_VLOG("l1pte_propagate_from_guest: gpte=0x%p, old spte=0x%p, new spte=0x%p", gpte, old_spte, spte); -#endif + if ( spte || gpte ) + SH_VLOG("%s: gpte=0x%p, new spte=0x%p", __func__, gpte, spte); +#endif *spte_p = spte; } @@ -582,7 +579,7 @@ static inline void l2pde_general( if ( (gpde & _PAGE_PRESENT) && (sl1mfn != 0) ) { spde = (gpde & ~PAGE_MASK) | (sl1mfn << PAGE_SHIFT) | - _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY; + _PAGE_RW | _PAGE_ACCESSED; gpde |= _PAGE_ACCESSED; /* N.B. PDEs do not have a dirty bit. */ // XXX mafetter: Hmm... 
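/* l2pde_general() sets _PAGE_ACCESSED but no longer pre-dirties shadow
 * PDEs -- there is no hardware dirty bit at the PDE level -- and
 * check_pte()'s dirty/RW2 coherence checks above are accordingly
 * confined to level-1 entries. */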
@@ -592,6 +589,9 @@ static inline void l2pde_general( *gpde_p = gpde; } + if ( spde || gpde ) + SH_VLOG("%s: gpde=0x%p, new spde=0x%p", __func__, gpde, spde); + *spde_p = spde; } @@ -828,7 +828,9 @@ static inline unsigned long ___shadow_status( static inline unsigned long __shadow_status( struct domain *d, unsigned long gpfn, unsigned long stype) { - unsigned long gmfn = __gpfn_to_mfn(d, gpfn); + unsigned long gmfn = ((current->domain == d) + ? __gpfn_to_mfn(d, gpfn) + : 0); ASSERT(spin_is_locked(&d->arch.shadow_lock)); ASSERT(gpfn == (gpfn & PGT_mfn_mask)); @@ -843,7 +845,7 @@ static inline unsigned long __shadow_status( return 0; } - return ___shadow_status(d, gmfn, stype); + return ___shadow_status(d, gpfn, stype); } /* @@ -1014,14 +1016,15 @@ static inline void delete_shadow_status( } static inline void set_shadow_status( - struct domain *d, unsigned long gpfn, + struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned long smfn, unsigned long stype) { struct shadow_status *x, *head, *extra; int i; - unsigned long gmfn = __gpfn_to_mfn(d, gpfn); unsigned long key = gpfn | stype; + SH_VVLOG("set gpfn=%p gmfn=%p smfn=%p t=%p\n", gpfn, gmfn, smfn, stype); + ASSERT(spin_is_locked(&d->arch.shadow_lock)); ASSERT(gpfn && !(gpfn & ~PGT_mfn_mask)); ASSERT(pfn_is_ram(gmfn)); // XXX need to be more graceful diff --git a/xen/include/xen/perfc_defn.h b/xen/include/xen/perfc_defn.h index 058265c416..34c0271db9 100644 --- a/xen/include/xen/perfc_defn.h +++ b/xen/include/xen/perfc_defn.h @@ -66,3 +66,5 @@ PERFCOUNTER_CPU(validate_pte_calls, "calls to validate_pte_change PERFCOUNTER_CPU(validate_pte_changes, "validate_pte makes changes") PERFCOUNTER_CPU(validate_pde_calls, "calls to validate_pde_change") PERFCOUNTER_CPU(validate_pde_changes, "validate_pde makes changes") +PERFCOUNTER_CPU(shadow_hl2_other_domain, "shadow_hl2 from other domain") +PERFCOUNTER_CPU(gpfn_to_mfn_safe, "calls to gpfn_to_mfn_safe") -- 2.30.2
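A closing note on the central data structure: alloc_p2m_table() and
gpfn_to_mfn_safe() treat the p2m map as an ordinary two-level x86_32
page table whose leaves record pfn -> mfn translations rather than live
virtual mappings.  The standalone toy program below is an illustrative
sketch only, not Xen code: it substitutes calloc() for
alloc_domheap_page(), plain pointers for mapped frames, and borrows
0x4700 from the MFN_TO_WATCH hook at the top of the patch as its
example machine frame.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ENTRIES  1024u   /* entries per table level (x86_32, non-PAE) */
#define PRESENT  0x1u    /* _PAGE_PRESENT analogue */

static uint32_t *p2m_root[ENTRIES];          /* stands in for the l2 frame */

static void p2m_set(uint32_t pfn, uint32_t mfn)
{
    uint32_t hi = pfn / ENTRIES, lo = pfn % ENTRIES;

    assert(pfn < ENTRIES * ENTRIES);         /* cf. ASSERT(pfn < (1u<<20)) */
    if ( p2m_root[hi] == NULL )              /* cf. alloc_domheap_page() */
    {
        p2m_root[hi] = calloc(ENTRIES, sizeof(uint32_t));
        assert(p2m_root[hi] != NULL);
    }
    p2m_root[hi][lo] = (mfn << 12) | PRESENT;
}

/* Mirrors gpfn_to_mfn_safe(): 0 doubles as "no mapping". */
static uint32_t p2m_lookup(uint32_t pfn)
{
    uint32_t hi = pfn / ENTRIES, lo = pfn % ENTRIES;

    assert(pfn < ENTRIES * ENTRIES);
    if ( p2m_root[hi] == NULL || !(p2m_root[hi][lo] & PRESENT) )
        return 0;
    return p2m_root[hi][lo] >> 12;
}

int main(void)
{
    p2m_set(0x1000, 0x4700);                 /* pfn 0x1000 lives in mfn 0x4700 */
    assert(p2m_lookup(0x1000) == 0x4700);
    assert(p2m_lookup(0x1001) == 0);         /* never mapped */
    printf("pfn 0x1000 -> mfn %#x\n", (unsigned)p2m_lookup(0x1000));
    return 0;
}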